{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:39:43.333734Z",
     "start_time": "2020-02-10T12:39:43.319264Z"
    },
    "lines_to_next_cell": 0
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "54"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "text = \"Sub-module available for the above is sent_tokenize. An obvious question in your mind would be why sentence tokenization is needed when we have the option of word tokenization. Imagine you need to count average words per sentence, how you will calculate? For accomplishing such a task, you need both sentence tokenization as well as words to calculate the ratio. Such output serves as an important feature for machine training as the answer would be numeric. Check the below example to learn how sentence tokenization is different from words tokenization.\"\n",
    "with open(\"cong_gpo_taxation_0_54.json\") as f:\n",
    "    json_corpus=json.load(f)\n",
    "len(json_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:11:58.006214Z",
     "start_time": "2020-02-10T12:11:58.003101Z"
    },
    "lines_to_next_cell": 0
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "555"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:11:58.578351Z",
     "start_time": "2020-02-10T12:11:58.573747Z"
    },
    "lines_to_next_cell": 0
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "898549"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_json=\"\".join(f\"START_ID_{article[0]} {article[2]}\" for article in json_corpus)\n",
    "len(text_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:11:59.087158Z",
     "start_time": "2020-02-10T12:11:59.083884Z"
    },
    "lines_to_next_cell": 0
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'text_json is 1619 bigger'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f\"text_json is {len(text_json)//len(text)} bigger\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:00.677129Z",
     "start_time": "2020-02-10T12:11:59.591892Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nltk version: 3.4.5\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "print(\"nltk version:\", nltk.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:14.952523Z",
     "start_time": "2020-02-10T12:12:01.361910Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "169 µs ± 3.02 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 169 µs ± 3.02 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o nltk.tokenize.sent_tokenize(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:19.791413Z",
     "start_time": "2020-02-10T12:12:17.995582Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "225 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 225 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o nltk.tokenize.sent_tokenize(text_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:25.553223Z",
     "start_time": "2020-02-10T12:12:25.541472Z"
    },
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "alphabets = \"([A-Za-z])\"\n",
    "prefixes = \"(Mr|St|Mrs|Ms|Dr)[.]\"\n",
    "suffixes = \"(Inc|Ltd|Jr|Sr|Co)\"\n",
    "starters = \"(Mr|Mrs|Ms|Dr|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)\"\n",
    "acronyms = \"([A-Z][.][A-Z][.](?:[A-Z][.])?)\"\n",
    "websites = \"[.](com|net|org|io|gov)\"\n",
    "\n",
    "\n",
    "def split_into_sentences(text):\n",
    "    text = \" \" + text + \"  \"\n",
    "    text = text.replace(\"\\n\", \" \")\n",
    "    text = re.sub(prefixes, \"\\\\1<prd>\", text)\n",
    "    text = re.sub(websites, \"<prd>\\\\1\", text)\n",
    "    if \"Ph.D\" in text:\n",
    "        text = text.replace(\"Ph.D.\", \"Ph<prd>D<prd>\")\n",
    "    text = re.sub(\"\\s\" + alphabets + \"[.] \", \" \\\\1<prd> \", text)\n",
    "    text = re.sub(acronyms + \" \" + starters, \"\\\\1<stop> \\\\2\", text)\n",
    "    text = re.sub(\n",
    "        alphabets + \"[.]\" + alphabets + \"[.]\" + alphabets + \"[.]\",\n",
    "        \"\\\\1<prd>\\\\2<prd>\\\\3<prd>\",\n",
    "        text,\n",
    "    )\n",
    "    text = re.sub(alphabets + \"[.]\" + alphabets + \"[.]\", \"\\\\1<prd>\\\\2<prd>\", text)\n",
    "    text = re.sub(\" \" + suffixes + \"[.] \" + starters, \" \\\\1<stop> \\\\2\", text)\n",
    "    text = re.sub(\" \" + suffixes + \"[.]\", \" \\\\1<prd>\", text)\n",
    "    text = re.sub(\" \" + alphabets + \"[.]\", \" \\\\1<prd>\", text)\n",
    "    if \"”\" in text:\n",
    "        text = text.replace(\".”\", \"”.\")\n",
    "    if '\"' in text:\n",
    "        text = text.replace('.\"', '\".')\n",
    "    if \"!\" in text:\n",
    "        text = text.replace('!\"', '\"!')\n",
    "    if \"?\" in text:\n",
    "        text = text.replace('?\"', '\"?')\n",
    "    text = text.replace(\".\", \".<stop>\")\n",
    "    text = text.replace(\"?\", \"?<stop>\")\n",
    "    text = text.replace(\"!\", \"!<stop>\")\n",
    "    text = text.replace(\"<prd>\", \".\")\n",
    "    sentences = text.split(\"<stop>\")\n",
    "    sentences = sentences[:-1]\n",
    "    sentences = [s.strip() for s in sentences]\n",
    "    return sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:36.209179Z",
     "start_time": "2020-02-10T12:12:30.521354Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "73.2 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 73.2 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o split_into_sentences(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:46.105610Z",
     "start_time": "2020-02-10T12:12:39.146504Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "90.2 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 90.2 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o split_into_sentences(text_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:51.490372Z",
     "start_time": "2020-02-10T12:12:50.275341Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "spacy version: 2.1.9\n",
      "model version: 2.1.0\n"
     ]
    }
   ],
   "source": [
    "import spacy\n",
    "import en_core_web_sm\n",
    "\n",
    "print(\"spacy version:\", spacy.__version__)\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "\n",
    "print(\"model version:\", nlp.meta[\"version\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "See https://github.com/explosion/spaCy/issues/453 \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:12:53.053681Z",
     "start_time": "2020-02-10T12:12:53.049614Z"
    },
    "lines_to_next_cell": 0
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'accuracy': {'ents_f': 85.8587845242,\n",
       "  'ents_p': 86.3317889027,\n",
       "  'ents_r': 85.3909350025,\n",
       "  'las': 89.6616629074,\n",
       "  'tags_acc': 96.7783856079,\n",
       "  'token_acc': 99.0697323163,\n",
       "  'uas': 91.5287392082},\n",
       " 'author': 'Explosion AI',\n",
       " 'description': 'English multi-task CNN trained on OntoNotes. Assigns context-specific token vectors, POS tags, dependency parse and named entities.',\n",
       " 'email': 'contact@explosion.ai',\n",
       " 'lang': 'en',\n",
       " 'license': 'MIT',\n",
       " 'name': 'core_web_sm',\n",
       " 'parent_package': 'spacy',\n",
       " 'pipeline': ['tagger', 'parser', 'ner'],\n",
       " 'sources': ['OntoNotes 5'],\n",
       " 'spacy_version': '>=2.1.0',\n",
       " 'speed': {'cpu': 6684.8046553827, 'gpu': None, 'nwords': 291314},\n",
       " 'url': 'https://explosion.ai',\n",
       " 'version': '2.1.0',\n",
       " 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlp.meta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:13:00.454894Z",
     "start_time": "2020-02-10T12:12:55.318532Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.28 ms ± 444 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 7.28 ms ± 444 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o doc = nlp(text,disable=['tagger', 'ner'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:14:33.043964Z",
     "start_time": "2020-02-10T12:13:10.482434Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.3 s ± 210 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 10.3 s ± 210 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o doc = nlp(text_json,disable=['tagger', 'ner'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text Preprocessing\n",
    "1. lower\n",
    "2. remove punctuation\n",
    "3. remove numbers\n",
    "4. remove superfluous white spaces\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:14:36.349986Z",
     "start_time": "2020-02-10T12:14:36.344835Z"
    },
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "import string\n",
    "from string import digits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:14:36.946109Z",
     "start_time": "2020-02-10T12:14:36.703429Z"
    },
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "sentences = nltk.tokenize.sent_tokenize(text)\n",
    "sentences_json = nltk.tokenize.sent_tokenize(text_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:14:37.156633Z",
     "start_time": "2020-02-10T12:14:37.153425Z"
    },
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "sentences_empty=[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:14:37.727142Z",
     "start_time": "2020-02-10T12:14:37.722298Z"
    },
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "def original_text_preprocessing(sentences):\n",
    "    sentences = [sentence.lower() for sentence in sentences if bool(sentence) != False]\n",
    "    sentences = [sentence.translate(str.maketrans('', '', string.punctuation)) for sentence in sentences] # get rid of punctuation\n",
    "    sentences = [sentence.translate(str.maketrans('', '', digits)) for sentence in sentences] # get rid of numbers\n",
    "    sentences = [\" \".join(sentence.split()) for sentence in sentences] # get rid of superfluous white spaces\n",
    "    return sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:14:41.367176Z",
     "start_time": "2020-02-10T12:14:38.248247Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "37.1 µs ± 149 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 37.1 µs ± 149 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o original_text_preprocessing(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:14:44.800654Z",
     "start_time": "2020-02-10T12:14:41.369035Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40.1 ms ± 508 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 40.1 ms ± 508 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o original_text_preprocessing(sentences_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:15:06.574655Z",
     "start_time": "2020-02-10T12:15:06.567401Z"
    },
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../code\")\n",
    "from utils import preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:15:07.819492Z",
     "start_time": "2020-02-10T12:15:07.814856Z"
    },
    "lines_to_next_cell": 0
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['submodule available for the above is senttokenize',\n",
       " 'an obvious question in your mind would be why sentence tokenization is needed when we have the option of word tokenization',\n",
       " 'imagine you need to count average words per sentence how you will calculate',\n",
       " 'for accomplishing such a task you need both sentence tokenization as well as words to calculate the ratio',\n",
       " 'such output serves as an important feature for machine training as the answer would be numeric',\n",
       " 'check the below example to learn how sentence tokenization is different from words tokenization']"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocess(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:15:22.397101Z",
     "start_time": "2020-02-10T12:15:08.563031Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17.6 µs ± 199 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 17.6 µs ± 199 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o preprocess(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T12:15:24.911631Z",
     "start_time": "2020-02-10T12:15:22.398826Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "29.2 ms ± 75.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<TimeitResult : 29.2 ms ± 75.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%timeit -o preprocess(sentences_json)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
